{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/users/fyx/.local/python3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
      "  return f(*args, **kwds)\n",
      "/data/users/fyx/.local/python3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
      "  return f(*args, **kwds)\n",
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import matchzoo as mz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the three WikiQA splits as ranking data packs. The dev and test\n",
    "# calls pass filter=True; the train call leaves `filter` at its default.\n",
    "load_wiki_qa = mz.datasets.wiki_qa.load_data\n",
    "train_pack = load_wiki_qa('train', task='ranking')\n",
    "valid_pack = load_wiki_qa('dev', task='ranking', filter=True)\n",
    "predict_pack = load_wiki_qa('test', task='ranking', filter=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit: 100%|██████████| 2118/2118 [00:00<00:00, 8457.68it/s]\n",
      "Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit: 100%|██████████| 18841/18841 [00:04<00:00, 4591.87it/s]\n",
      "Processing text_right with append: 100%|██████████| 18841/18841 [00:00<00:00, 735026.29it/s]\n",
      "Building FrequencyFilterUnit from a datapack.: 100%|██████████| 18841/18841 [00:00<00:00, 130835.25it/s]\n",
      "Processing text_right with transform: 100%|██████████| 18841/18841 [00:00<00:00, 79521.17it/s] \n",
      "Processing text_left with extend: 100%|██████████| 2118/2118 [00:00<00:00, 435147.48it/s]\n",
      "Processing text_right with extend: 100%|██████████| 18841/18841 [00:00<00:00, 641170.31it/s]\n",
      "Building VocabularyUnit from a datapack.: 100%|██████████| 404415/404415 [00:00<00:00, 2697710.53it/s]\n",
      "Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit: 100%|██████████| 2118/2118 [00:00<00:00, 8388.11it/s]\n",
      "Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit: 100%|██████████| 18841/18841 [00:04<00:00, 4623.86it/s]\n",
      "Processing text_right with transform: 100%|██████████| 18841/18841 [00:00<00:00, 117352.76it/s]\n",
      "Processing text_left with transform: 100%|██████████| 2118/2118 [00:00<00:00, 132812.10it/s]\n",
      "Processing text_right with transform: 100%|██████████| 18841/18841 [00:00<00:00, 119010.51it/s]\n",
      "Processing length_left with len: 100%|██████████| 2118/2118 [00:00<00:00, 538037.42it/s]\n",
      "Processing length_right with len: 100%|██████████| 18841/18841 [00:00<00:00, 608412.56it/s]\n",
      "Processing text_left with transform: 100%|██████████| 2118/2118 [00:00<00:00, 112685.18it/s]\n",
      "Processing text_right with transform: 100%|██████████| 18841/18841 [00:00<00:00, 79215.55it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build the preprocessor with fixed lengths of 10 (left text) and 100\n",
    "# (right text) and stop-word removal switched off, then fit it on the\n",
    "# training pack and transform that pack in a single call.\n",
    "preprocessor = mz.preprocessors.BasicPreprocessor(\n",
    "    fixed_length_left=10,\n",
    "    fixed_length_right=100,\n",
    "    remove_stop_words=False,\n",
    ")\n",
    "train_pack_processed = preprocessor.fit_transform(train_pack)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit: 100%|██████████| 122/122 [00:00<00:00, 8031.41it/s]\n",
      "Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit: 100%|██████████| 1115/1115 [00:00<00:00, 4764.18it/s]\n",
      "Processing text_right with transform: 100%|██████████| 1115/1115 [00:00<00:00, 95609.62it/s]\n",
      "Processing text_left with transform: 100%|██████████| 122/122 [00:00<00:00, 116694.43it/s]\n",
      "Processing text_right with transform: 100%|██████████| 1115/1115 [00:00<00:00, 74636.51it/s]\n",
      "Processing length_left with len: 100%|██████████| 122/122 [00:00<00:00, 130072.47it/s]\n",
      "Processing length_right with len: 100%|██████████| 1115/1115 [00:00<00:00, 455015.47it/s]\n",
      "Processing text_left with transform: 100%|██████████| 122/122 [00:00<00:00, 60650.12it/s]\n",
      "Processing text_right with transform: 100%|██████████| 1115/1115 [00:00<00:00, 84874.12it/s]\n",
      "Processing text_left with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit: 100%|██████████| 237/237 [00:00<00:00, 9071.62it/s]\n",
      "Processing text_right with chain_transform of TokenizeUnit => LowercaseUnit => PuncRemovalUnit: 100%|██████████| 2300/2300 [00:00<00:00, 4612.53it/s]\n",
      "Processing text_right with transform: 100%|██████████| 2300/2300 [00:00<00:00, 127373.66it/s]\n",
      "Processing text_left with transform: 100%|██████████| 237/237 [00:00<00:00, 137223.92it/s]\n",
      "Processing text_right with transform: 100%|██████████| 2300/2300 [00:00<00:00, 132341.47it/s]\n",
      "Processing length_left with len: 100%|██████████| 237/237 [00:00<00:00, 226383.52it/s]\n",
      "Processing length_right with len: 100%|██████████| 2300/2300 [00:00<00:00, 543211.85it/s]\n",
      "Processing text_left with transform: 100%|██████████| 237/237 [00:00<00:00, 90179.63it/s]\n",
      "Processing text_right with transform: 100%|██████████| 2300/2300 [00:00<00:00, 16596.30it/s]\n"
     ]
    }
   ],
   "source": [
    "# Apply the preprocessor that was fitted on the training pack to the dev\n",
    "# and test packs, in that order; nothing is re-fitted here.\n",
    "valid_pack_processed, predict_pack_processed = [\n",
    "    preprocessor.transform(pack) for pack in (valid_pack, predict_pack)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ranking task trained with a rank hinge loss and reported with NDCG@3,\n",
    "# NDCG@5 and mean average precision (in that order).\n",
    "ranking_task = mz.tasks.Ranking(loss=mz.losses.RankHingeLoss())\n",
    "ranking_task.metrics = [\n",
    "    *(mz.metrics.NormalizedDiscountedCumulativeGain(k=cutoff) for cutoff in (3, 5)),\n",
    "    mz.metrics.MeanAveragePrecision(),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Parameter \"name\" set to ArcII.\n",
      "__________________________________________________________________________________________________\n",
      "Layer (type)                    Output Shape         Param #     Connected to                     \n",
      "==================================================================================================\n",
      "text_left (InputLayer)          (None, 10)           0                                            \n",
      "__________________________________________________________________________________________________\n",
      "text_right (InputLayer)         (None, 100)          0                                            \n",
      "__________________________________________________________________________________________________\n",
      "embedding (Embedding)           multiple             5002200     text_left[0][0]                  \n",
      "                                                                 text_right[0][0]                 \n",
      "__________________________________________________________________________________________________\n",
      "conv1d_1 (Conv1D)               (None, 10, 32)       28832       embedding[0][0]                  \n",
      "__________________________________________________________________________________________________\n",
      "conv1d_2 (Conv1D)               (None, 100, 32)      28832       embedding[1][0]                  \n",
      "__________________________________________________________________________________________________\n",
      "matching_layer_1 (MatchingLayer (None, 10, 100, 32)  0           conv1d_1[0][0]                   \n",
      "                                                                 conv1d_2[0][0]                   \n",
      "__________________________________________________________________________________________________\n",
      "conv2d_1 (Conv2D)               (None, 10, 100, 64)  18496       matching_layer_1[0][0]           \n",
      "__________________________________________________________________________________________________\n",
      "max_pooling2d_1 (MaxPooling2D)  (None, 3, 33, 64)    0           conv2d_1[0][0]                   \n",
      "__________________________________________________________________________________________________\n",
      "conv2d_2 (Conv2D)               (None, 3, 33, 64)    36928       max_pooling2d_1[0][0]            \n",
      "__________________________________________________________________________________________________\n",
      "max_pooling2d_2 (MaxPooling2D)  (None, 1, 11, 64)    0           conv2d_2[0][0]                   \n",
      "__________________________________________________________________________________________________\n",
      "flatten_1 (Flatten)             (None, 704)          0           max_pooling2d_2[0][0]            \n",
      "__________________________________________________________________________________________________\n",
      "dropout_1 (Dropout)             (None, 704)          0           flatten_1[0][0]                  \n",
      "__________________________________________________________________________________________________\n",
      "dense_1 (Dense)                 (None, 1)            705         dropout_1[0][0]                  \n",
      "==================================================================================================\n",
      "Total params: 5,115,993\n",
      "Trainable params: 5,115,993\n",
      "Non-trainable params: 0\n",
      "__________________________________________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "# Configure an ArcII model from the preprocessor context and the ranking\n",
    "# task, let MatchZoo fill whatever is still unset, then build, compile and\n",
    "# print the backend summary.\n",
    "model = mz.models.ArcII()\n",
    "\n",
    "# (name, value) pairs are applied one by one, in the order listed.\n",
    "arcii_settings = [\n",
    "    ('input_shapes', preprocessor.context['input_shapes']),\n",
    "    ('task', ranking_task),\n",
    "    ('embedding_input_dim', preprocessor.context['vocab_size']),\n",
    "    ('embedding_output_dim', 300),\n",
    "    ('embedding_trainable', True),\n",
    "    ('num_blocks', 2),\n",
    "    ('kernel_1d_count', 32),\n",
    "    ('kernel_1d_size', 3),\n",
    "    ('kernel_2d_count', [64, 64]),\n",
    "    ('kernel_2d_size', [3, 3]),\n",
    "    ('pool_2d_size', [[3, 3], [3, 3]]),\n",
    "    ('optimizer', 'adam'),\n",
    "]\n",
    "for param_name, param_value in arcii_settings:\n",
    "    model.params[param_name] = param_value\n",
    "\n",
    "model.guess_and_fill_missing_params()\n",
    "model.build()\n",
    "model.compile()\n",
    "model.backend.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build an embedding matrix from 300-dimensional GloVe vectors, indexed by\n",
    "# the term index stored in the fitted preprocessor's vocabulary unit.\n",
    "# The dimension matches the model's embedding_output_dim of 300.\n",
    "term_index = preprocessor.context['vocab_unit'].state['term_index']\n",
    "glove_embedding = mz.datasets.embeddings.load_glove_embedding(dimension=300)\n",
    "embedding_matrix = glove_embedding.build_matrix(term_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hand the GloVe-based matrix to the already built model.\n",
    "model.load_embedding_matrix(embedding_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-epoch evaluation callback, fed with the dev split.\n",
    "# The callback's log lines are labelled 'Validation'; scoring the test\n",
    "# split (predict_pack_processed) after every epoch would let test results\n",
    "# steer training decisions, so the test pack stays held out here.\n",
    "valid_x, valid_y = valid_pack_processed[:].unpack()\n",
    "\n",
    "# batch_size is the number of labels, so the whole split is scored in one batch.\n",
    "evaluate = mz.callbacks.EvaluateAllMetrics(model, x=valid_x, y=valid_y, batch_size=len(valid_y))\n",
    "len(valid_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "102"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pair-wise training generator over the processed training pack.\n",
    "# The trailing expression displays the generator's length, i.e. the number\n",
    "# of steps per epoch shown by the training progress bar.\n",
    "train_generator = mz.PairDataGenerator(\n",
    "    train_pack_processed,\n",
    "    num_dup=2,\n",
    "    num_neg=1,\n",
    "    batch_size=20,\n",
    ")\n",
    "len(train_generator)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/30\n",
      "102/102 [==============================] - 14s 142ms/step - loss: 0.5740\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5993770560131093 - normalized_discounted_cumulative_gain@5(0): 0.650594665697349 - mean_average_precision(0): 0.6051205709433557\n",
      "Epoch 2/30\n",
      "102/102 [==============================] - 13s 126ms/step - loss: 0.1524\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5567182600732954 - normalized_discounted_cumulative_gain@5(0): 0.6180917374441505 - mean_average_precision(0): 0.5654610242446931\n",
      "Epoch 3/30\n",
      "102/102 [==============================] - 13s 124ms/step - loss: 0.0343\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5913961064490625 - normalized_discounted_cumulative_gain@5(0): 0.6537170030449837 - mean_average_precision(0): 0.6101269001399305\n",
      "Epoch 4/30\n",
      "102/102 [==============================] - 13s 126ms/step - loss: 0.0149\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5717007371002853 - normalized_discounted_cumulative_gain@5(0): 0.6366372272196775 - mean_average_precision(0): 0.5822640247692663\n",
      "Epoch 5/30\n",
      "102/102 [==============================] - 13s 126ms/step - loss: 0.0175\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5765647979354743 - normalized_discounted_cumulative_gain@5(0): 0.6426610380482014 - mean_average_precision(0): 0.5941299494252578\n",
      "Epoch 6/30\n",
      "102/102 [==============================] - 13s 126ms/step - loss: 0.0219\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5758109240875372 - normalized_discounted_cumulative_gain@5(0): 0.6479728084931065 - mean_average_precision(0): 0.596952423455588\n",
      "Epoch 7/30\n",
      "102/102 [==============================] - 13s 125ms/step - loss: 0.0174\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5746173317595609 - normalized_discounted_cumulative_gain@5(0): 0.6414961271711739 - mean_average_precision(0): 0.5893993380750397\n",
      "Epoch 8/30\n",
      "102/102 [==============================] - 13s 124ms/step - loss: 0.0092\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5744891994604373 - normalized_discounted_cumulative_gain@5(0): 0.6416719323222839 - mean_average_precision(0): 0.5886615548404748\n",
      "Epoch 9/30\n",
      "102/102 [==============================] - 13s 125ms/step - loss: 0.0278\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5593699728420024 - normalized_discounted_cumulative_gain@5(0): 0.6154495682979294 - mean_average_precision(0): 0.5673934176720364\n",
      "Epoch 10/30\n",
      "102/102 [==============================] - 13s 123ms/step - loss: 0.0232\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5881578906809148 - normalized_discounted_cumulative_gain@5(0): 0.6394262458370498 - mean_average_precision(0): 0.5951101717082729\n",
      "Epoch 11/30\n",
      "102/102 [==============================] - 13s 125ms/step - loss: 0.0207\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5881578906809148 - normalized_discounted_cumulative_gain@5(0): 0.6394262458370498 - mean_average_precision(0): 0.5951101717082729\n",
      "Epoch 11/30\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5613547113904753 - normalized_discounted_cumulative_gain@5(0): 0.6269139022609665 - mean_average_precision(0): 0.5775492945379688\n",
      "Epoch 12/30\n",
      "102/102 [==============================] - 13s 124ms/step - loss: 0.0221\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5702128102602414 - normalized_discounted_cumulative_gain@5(0): 0.6415978899464336 - mean_average_precision(0): 0.590659170212816\n",
      "Epoch 13/30\n",
      "102/102 [==============================] - 13s 123ms/step - loss: 0.0162\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5769510273981844 - normalized_discounted_cumulative_gain@5(0): 0.6339041239754759 - mean_average_precision(0): 0.5857824547697966\n",
      "Epoch 14/30\n",
      "102/102 [==============================] - 13s 124ms/step - loss: 0.0169\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5507207181913453 - normalized_discounted_cumulative_gain@5(0): 0.6113441552182128 - mean_average_precision(0): 0.564103775277576\n",
      "Epoch 15/30\n",
      "102/102 [==============================] - 14s 133ms/step - loss: 0.0244\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5847216045711233 - normalized_discounted_cumulative_gain@5(0): 0.636609385685753 - mean_average_precision(0): 0.5880658301166005\n",
      "Epoch 16/30\n",
      "102/102 [==============================] - 12s 122ms/step - loss: 0.0107\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5194273355616376 - normalized_discounted_cumulative_gain@5(0): 0.5858470108107657 - mean_average_precision(0): 0.5463461774614633\n",
      "Epoch 17/30\n",
      "102/102 [==============================] - 13s 124ms/step - loss: 0.0237\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5354493808134728 - normalized_discounted_cumulative_gain@5(0): 0.5988083553213212 - mean_average_precision(0): 0.5499018306563277\n",
      "Epoch 18/30\n",
      "102/102 [==============================] - 13s 123ms/step - loss: 0.0219\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5367279557445404 - normalized_discounted_cumulative_gain@5(0): 0.6014043512561841 - mean_average_precision(0): 0.5584394328564718\n",
      "Epoch 19/30\n",
      "101/102 [============================>.] - ETA: 0s - loss: 0.0189Validation: normalized_discounted_cumulative_gain@3(0): 0.5367279557445404 - normalized_discounted_cumulative_gain@5(0): 0.6014043512561841 - mean_average_precision(0): 0.5584394328564718\n",
      "102/102 [==============================] - 13s 132ms/step - loss: 0.0187\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5678018587592725 - normalized_discounted_cumulative_gain@5(0): 0.6317739720161761 - mean_average_precision(0): 0.5859995163398286\n",
      "Epoch 20/30\n",
      "102/102 [==============================] - 12s 122ms/step - loss: 0.0153\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5639109046968998 - normalized_discounted_cumulative_gain@5(0): 0.6231458887428801 - mean_average_precision(0): 0.5770061396643675\n",
      "Epoch 21/30\n",
      "102/102 [==============================] - 13s 124ms/step - loss: 0.0190\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5639109046968998 - normalized_discounted_cumulative_gain@5(0): 0.6231458887428801 - mean_average_precision(0): 0.5770061396643675\n",
      "Epoch 21/30\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5389674484775825 - normalized_discounted_cumulative_gain@5(0): 0.612937176471047 - mean_average_precision(0): 0.5610544788306019\n",
      "Epoch 22/30\n",
      "102/102 [==============================] - 12s 122ms/step - loss: 0.0124\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5649953589564279 - normalized_discounted_cumulative_gain@5(0): 0.6297954259255651 - mean_average_precision(0): 0.5852483348422166\n",
      "Epoch 23/30\n",
      "102/102 [==============================] - 12s 122ms/step - loss: 0.0213\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5787393082362849 - normalized_discounted_cumulative_gain@5(0): 0.632235971859535 - mean_average_precision(0): 0.5883712852065472\n",
      "Epoch 24/30\n",
      "102/102 [==============================] - 12s 122ms/step - loss: 0.0040\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5546765909918004 - normalized_discounted_cumulative_gain@5(0): 0.6046172376580535 - mean_average_precision(0): 0.5673391900655247\n",
      "Epoch 25/30\n",
      "102/102 [==============================] - 12s 122ms/step - loss: 0.0041\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5546867041558529 - normalized_discounted_cumulative_gain@5(0): 0.625208219019932 - mean_average_precision(0): 0.5713020295457005\n",
      "Epoch 26/30\n",
      "102/102 [==============================] - 12s 121ms/step - loss: 0.0092\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5314449046886515 - normalized_discounted_cumulative_gain@5(0): 0.6024514905728754 - mean_average_precision(0): 0.5570193278208605\n",
      "Epoch 27/30\n",
      "102/102 [==============================] - 13s 130ms/step - loss: 0.0086\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5363597288819961 - normalized_discounted_cumulative_gain@5(0): 0.6075801320241666 - mean_average_precision(0): 0.5566135777734567\n",
      "Epoch 28/30\n",
      "102/102 [==============================] - 12s 122ms/step - loss: 0.0174\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5652158445115879 - normalized_discounted_cumulative_gain@5(0): 0.6321715408570834 - mean_average_precision(0): 0.579656449692911\n",
      "Epoch 29/30\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "102/102 [==============================] - 13s 126ms/step - loss: 0.0153\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.566055820213695 - normalized_discounted_cumulative_gain@5(0): 0.6283494205190083 - mean_average_precision(0): 0.5831118521198323\n",
      "Epoch 30/30\n",
      "102/102 [==============================] - 13s 123ms/step - loss: 0.0054\n",
      "Validation: normalized_discounted_cumulative_gain@3(0): 0.5712552689173427 - normalized_discounted_cumulative_gain@5(0): 0.6259786277674115 - mean_average_precision(0): 0.5751640592363443\n"
     ]
    }
   ],
   "source": [
    "# Train for 30 epochs; the `evaluate` callback prints the task metrics\n",
    "# after each epoch.\n",
    "# NOTE(review): workers=30 with multiprocessing is hard-coded for the\n",
    "# original host - confirm it suits the machine this runs on.\n",
    "history = model.fit_generator(\n",
    "    train_generator,\n",
    "    epochs=30,\n",
    "    callbacks=[evaluate],\n",
    "    workers=30,\n",
    "    use_multiprocessing=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
